rm(list = ls())
gc()

library(lubridate)
library(foreign)
library(ggplot2)
library(ggthemes)
library(estimatr)
library(naniar)
library(tidyverse)

source("functions/pca_fun.R")

df = readRDS("data/svy.rds")
df$hh_inc_stable




# Mean imputation ---------------------------------------------------------
for(i in 1:ncol(df)){
  cl = df[,i] %>% class
  if(cl != "numeric"){
    print(names(df)[i])
  }else{
    df[is.na(df[,i]), i] = mean(df[,i], na.rm = TRUE)
  }
}

# Econ well being in Lebanon ----------------------------------------------------
df$head_work_legal %>% table(useNA = "always")
df$head_work_hours_4 %>% table(useNA = "always")
df$head_income %>% table(useNA = "always")
df$hh_inc_source_aid %>% table(useNA = "always")
df$aid_atm %>% table(useNA = "always")
df$aid_wfp %>% table(useNA = "always")
df$aid_wfp %>% table(useNA = "always")
df$hh_leb_2011 %>% table(useNA = "always")
df$hh_inc_stable %>% table(useNA = "always")
df$aid_chg %>% table(useNA = "always")
df$own_items_1 %>% table(useNA = "always")
df$own_items_2 %>% table(useNA = "always")
df$own_items_3 %>% table(useNA = "always")
df$own_items_4 %>% table(useNA = "always")
df$own_items_5 %>% table(useNA = "always")
df$own_items_6 %>% table(useNA = "always")
df$own_items_7 %>% table(useNA = "always")
df$own_items_8 %>% table(useNA = "always")
df$own_items_9 %>% table(useNA = "always")
df$own_items_10 %>% table(useNA = "always")
df$own_items_11 %>% table(useNA = "always")
df$expenses_ability %>% table(useNA = "always")
df$assets_value %>% table(useNA = "always")
df$hh_income %>% table(useNA = "always")

econ_well_being_leb = c("head_work_legal", "head_work_days_4", "head_work_hours_4", "head_income",
                        "hh_inc_source_aid", "aid_atm", "aid_wfp", "hh_leb_2011", "hh_inc_stable", 
                        "aid_chg", "own_items_1", "own_items_2", "own_items_3", "own_items_4", "own_items_5",
                        "own_items_6", "own_items_7", "own_items_8", "own_items_10", "own_items_11",
                        "expenses_ability", "assets_value", "assets_months", "hh_income")

df[,econ_well_being_leb] %>% summary


data_pc = matrix(nrow = length(econ_well_being_leb), ncol = 11)
data_pc[,1] = 1:nrow(data_pc)


df$index_econ_leb = PCA_extract(vars = df[, econ_well_being_leb], cor_type = "pear")[[2]] %>% as.vector()
PCA_extract(vars = df[, econ_well_being_leb], cor_type = "pear")[[1]]

# Social wellbeing in Lebanon ----------------------------------------------------
df$intg_connected %>% table(useNA = "always")
df$intg_outsider %>% table(useNA = "always")
df$enum_arabic %>% table(useNA = "always")
df$head_literate %>% table(useNA = "always")
df$intg_politics %>% table(useNA = "always")
df$intg_discuss %>% table(useNA = "always")
df$meal_share %>% table(useNA = "always")
df$intg_access_doctor %>% table(useNA = "always")
df$intg_access_job %>% table(useNA = "always")
df$length_stay %>% table(useNA = "always")

df$ipl_connected = 6 - df$intg_connected
df$ipl_outsider = 6 - df$intg_outsider
df$ipl_arabic = 4 - df$enum_arabic #reversing order of variable
df$ipl_literate = 4 - df$head_literate #reversing order of variable
df$ipl_politics = 6 - df$intg_politics #reversing order of variable
df$ipl_discuss = df$intg_discuss
df$ipl_meal = df$meal_share
df$ipl_lebanese = df$lebanese_contacts
df$ipl_doctor = 6 - df$intg_access_doctor #reversing order of variable
df$ipl_job = 6 - df$intg_access_job #reversing order of variable
df$ipl_syrian = df$syrian_contacts
df$length_stay %>% table(useNA = "always")

df = df %>% 
  mutate(length_stay2 = cut(length_stay, breaks = c(quantile(length_stay, na.rm = T)), include.lowest = T),
         length_stay2 = as.numeric(length_stay2)) %>% 
  ungroup()

df$length_stay = df$length_stay2
df$length_stay2 = NULL

df$curfews_never %>% table(useNA = "always")
df$no_curfews_now %>% table(useNA = "always")
df$public_treat_person %>% table(useNA = "always")

df$ind_public_treat = df$public_treat_person
df$ind_public_treat = 5 - df$ind_public_treat #reverse coding 
df$public_treat_person = NULL

df$auth_treat_person %>% table(useNA = "always")

df$ind_auth_treat = df$auth_treat_person
df$ind_auth_treat = 5 - df$ind_auth_treat #reverse coding 

df$not_detained %>% table(useNA = "always")
df$not_detained = df$detained
df$not_detained = 1 - df$not_detained #reverse coding
df$detained = NULL

df$no_disc_housing = df$disc_housing
df$mobility %>% table(useNA = "always")
df$disc_housing = NULL

df$ind_mobility = df$mobility
df$ind_mobility = 5 - df$ind_mobility #reverse coding

df$ind_mobility_hh = df$mobility_hh
df$ind_mobility_hh = 4 - df$ind_mobility_hh
df$mobility_hh = NULL

#Note: removed "ipl_meal", "ipl_lebanese" because they're in networks
soc_well_being_leb = c("ipl_connected", "ipl_outsider", "ipl_arabic", "ipl_literate", "ipl_politics", "ipl_discuss", 
                       "ipl_job", "length_stay", "curfews_never", 
                       "no_curfews_now", "ind_public_treat", "ind_auth_treat", "not_detained", "no_disc_housing", 
                       "ind_mobility", "ind_mobility_hh")

df[,soc_well_being_leb] %>% summary

df$index_soc_leb = PCA_extract(vars = df[, soc_well_being_leb], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, soc_well_being_leb], cor_type = "poly")[[1]]


# H3 services in Lebanon --------------------------------------------------
df$head_sick %>% table(useNA = "always")
df$head_not_sick = 1 - df$head_sick #reverse order 
df$head_sick = NULL

df$head_treated %>% table(useNA = "always")
df$disc_healthcare %>% table(useNA = "always")

df$towns_leb %>% table(useNA = "always")
df$stability_towns = df$towns_leb
df$stability_towns = with(df, ifelse(stability_towns == 1, 3,
                                         ifelse(stability_towns == 2, 2,
                                                ifelse(stability_towns > 2, 1, stability_towns))))
df$towns_leb = NULL

df$stability_current %>% table(useNA = "always")

df = df %>% 
  mutate(stability_current = cut(stability_current, breaks = quantile(stability_current, na.rm = T), include.lowest = T),
         stability_current = as.numeric(stability_current)) %>% 
  ungroup()

df$intg_access_legal %>% table(useNA = "always")
df$access_legal = 6 - df$intg_access_legal #reverse coding

df$person_nosick %>% table(useNA = "always")
df$person_treat %>% table(useNA = "always")

df$no_need_school %>% table(useNA = "always")
df$scl_not_preventive %>% table(useNA = "always")

services_lebanon = c("head_not_sick", "head_treated", "person_nosick", "person_treat", 
                     "ipl_doctor", "disc_healthcare", "no_need_school", 
                     "stability_towns", "stability_current", "scl_not_preventive", "access_legal", "own_items_9")
summary(df[,services_lebanon])

df$index_services_leb = PCA_extract(vars = df[, services_lebanon], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, services_lebanon], cor_type = "poly")[[1]]


# H4: Legal well-being ----------------------------------------------------
df$resident %>% table(useNA = "always")
df$registered %>% table(useNA = "always")

legal_lebanon = c("registered", "resident")
df[,legal_lebanon] %>% summary

df$index_legal_leb = PCA_extract(vars = df[, legal_lebanon], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, legal_lebanon], cor_type = "poly")[[1]]

# H5 Syria safety ---------------------------------------------------------
df$risk_urban %>% table(useNA = "always")
df$safety_current = df$risk_urban
df$risk_urban = NULL

df$protests %>% table(useNA = "always")
df$head_violence %>% table(useNA = "always")

df$exposure_violence = df$head_violence
df$head_violence = NULL 

df$conscription %>% table(useNA = "always")
df$safe_future = df$safety_chg
df$safe_future = 5 - df$safe_future #reverse coding
df$safety_chg = NULL

df$info_trust_arabiya %>% table(useNA = "always")
df$info_trust_jazeera %>% table(useNA = "always")
df$info_trust_mayadeen %>% table(useNA = "always")
df$info_trust_manar %>% table(useNA = "always")

df$info_oppsn = rowSums(df[, c("info_trust_arabiya", "info_trust_jazeera")], na.rm = T)/2
df$info_regime = rowSums(df[, c("info_trust_mayadeen", "info_trust_manar")], na.rm = T)/2

df$oppsn_sympathy = ifelse(df$info_oppsn > df$info_regime, 1, 0)

control_syria = c("control_now_regime", "control_now_oppsn", "control_now_russia", "control_now_turkey",
                  "control_now_kurds", "contested_now", "control_past_oppsn", "control_past_russia", 
                  "control_past_regime", "control_past_kurds", "control_past_turkey", "contested_past", "isis_control")

safety_syria = c("safety_current", "oppsn_sympathy", "protests", "exposure_violence",
                 "safe_future", "conscription")

df[,safety_syria] %>% summary
df[,control_syria] %>% summary

df$index_safety_syria = PCA_extract(vars = df[, safety_syria], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, safety_syria], cor_type = "poly")[[1]]

df$index_control_syria = PCA_extract(vars = df[, control_syria], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, control_syria], cor_type = "poly")[[1]]

df$index_regime_control = - df$index_control_syria
df$index_control_syria = NULL

# H6 Econ Syria --------------------------------------------------------------
df$jobs_origin %>% table(useNA = "always")
df$syr_debt %>% table(useNA = "always")

df$debt_fin = NA
df$debt_fin[df$syr_debt == 0] = 0
df$debt_fin[df$syr_debt %in% c(1,2)] = 1
df$debt_fin[df$syr_debt > 0 & df$syr_debt < 1] = 1
df$debt_fin[df$syr_debt > 2] = 2

df$debt_fin %>% table(useNA = "always")
df$syr_debt = NULL

df$syr_property_1 %>% table(useNA = "always")
df$syr_property_2 %>% table(useNA = "always")
df$syr_property_3 %>% table(useNA = "always")

df$syr_land_future %>% table(useNA = "always")
df$syr_home_doc %>% table(useNA = "always")

econ_syria = c("jobs_origin", "debt_fin", "syr_property_1", "syr_property_2", "syr_property_3", 
               "syr_land_future", "syr_home_doc")

summary(df[,econ_syria])

df$index_econ_syria = PCA_extract(vars = df[, econ_syria], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, econ_syria], cor_type = "poly")[[1]]

# Services Syria ----------------------------------------------------------
df$elect_origin %>% table(useNA = "always")
df$services_syr_elect = 6 - df$elect_origin
df$elect_origin = NULL

df$water_origin %>% table(useNA = "always")
df$services_syr_water = 6 - df$water_origin
df$water_origin = NULL

df$schools_origin %>% table(useNA = "always")
df$services_syr_scl = df$schools_origin
df$schools_origin = NULL

df$health_origin %>% table(useNA = "always")
df$services_syr_health = df$health_origin
df$health_origin = NULL

df$services_chg %>% table(useNA = "always")
df$services_syr_improve = 5 - df$services_chg
df$services_chg = NULL

services_syria = c("services_syr_elect", "services_syr_water", "services_syr_scl", 
                   "services_syr_health", "services_syr_improve")

summary(df[,services_syria])

df$index_services_syria = PCA_extract(vars = df[, services_syria], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, services_syria], cor_type = "poly")[[1]]


# Networks Syria ----------------------------------------------------------------
df$syr_stay %>% table(useNA = "always")
df$relatives_rtrn1 %>% table(useNA = "always")
df$relatives_rtrn2 %>% table(useNA = "always")
df$hh_rtrn %>% table(useNA = "always")

networks_syria = c("syr_stay", "relatives_rtrn1", "relatives_rtrn2", "hh_rtrn")
summary(df[,networks_syria])

df$index_networks_syria = PCA_extract(vars = df[, networks_syria], cor_type = "pear")[[2]] %>% as.vector()
PCA_extract(vars = df[, networks_syria], cor_type = "pear")[[1]]


# Networks Lebanon --------------------------------------------------------
df$hh_syr_leb %>% table(useNA = "always")
df$leban_relatives %>% table(useNA = "always")

df$leban_relatives2 = recode(df$leban_relatives, "3" = 0, "2" = 1, "1" = 2, .default = NA_real_)
df$leban_relatives2[is.na(df$leban_relatives2)] = 0
df$leban_relatives2 %>% table(useNA = "always")

networks_lebanon = c("hh_syr_leb", "ipl_lebanese", "ipl_syrian", "leban_relatives2", "ipl_meal")
summary(df[,networks_lebanon])

df$index_networks_lebanon = PCA_extract(vars = df[, networks_lebanon], cor_type = "pear")[[2]] %>% as.vector()
PCA_extract(vars = df[, networks_lebanon], cor_type = "pear")[[1]]

# Information 9.1 ---------------------------------------------------------
df$know_safe %>% table(useNA = "always")
df$know_jobs %>% table(useNA = "always")
df$know_services %>% table(useNA = "always")
df$know_conscription %>% table(useNA = "always")

df$info_safe = ifelse(df$know_safe == 1, 1, ifelse(df$know_safe %in% c(2, 3), 0, NA))
df$info_jobs = ifelse(df$know_jobs == 1, 1, ifelse(df$know_jobs %in% c(2, 3), 0, NA))
df$info_services = ifelse(df$know_services == 1, 1, ifelse(df$know_services %in% c(2, 3), 0, NA))
df$info_conscription = ifelse(df$know_conscription == 1, 1, ifelse(df$know_conscription %in% c(2, 3), 0, NA))

df$info_safe[is.na(df$info_safe)] = 0
df$info_jobs[is.na(df$info_jobs)] = 0
df$info_services[is.na(df$info_services)] = 0
df$info_conscription[is.na(df$info_conscription)] = 0

df$info_safe %>% table(useNA = "always")
df$info_jobs %>% table(useNA = "always")
df$info_services %>% table(useNA = "always")
df$info_conscription %>% table(useNA = "always")

df$info_syr_rln %>% table(useNA = "always")

df$communicate_freq_urb %>% table
df$info_syr_communicate = df$communicate_freq_urb
df$info_syr_communicate = 7 - df$info_syr_communicate

info_quality = c("info_safe", "info_jobs", "info_services", "info_conscription", "info_syr_rln", "info_syr_communicate")
summary(df[,info_quality])

df$index_info_quality = PCA_extract(vars = df[, info_quality], cor_type = "poly")[[2]] %>% as.vector()
PCA_extract(vars = df[, info_quality], cor_type = "poly")[[1]]

# Mobility costs ----------------------------------------------------------
# switch source
mobility = read.csv("data/google_maps_travel_distance_output.csv", stringsAsFactors = F)

mobility = mobility %>% select(response_num, log_travel_distance) %>% rename(index_mobility = log_travel_distance)

df = df %>% left_join(mobility)

df$index_family = log(df$hh_size + 1) #When we asked for household size, we excluded the head of household

indices = df %>% 
  select(c(response_num, names(df)[str_detect(names(df), "^index")]))

indices %>% write.csv("out/indices_mean.csv", row.names = F)

# Fixing outcomes -------------------------------------------------------
df$rtrn_head_12 = recode(df$rtrn_12mon, "1" = 1, "2" = 0, "3" = 0, .default = NA_real_)
df$rtrn_12mon %>% table
df$rtrn_head_12[is.na(df$rtrn_head_12)] = 0

df$rtrn_head_ever %>% summary

df$rtrn_ever %>% table(useNA = "always")
df$rtrn_head_never = recode(df$rtrn_ever, "1" = 0, "2" = 0, "3" = 1, .default = NA_real_)
df$rtrn_head_never[is.na(df$rtrn_head_ever)] = 0

df$rtrn_head_never[df$rtrn_head_12==1] %>% table

df$rtrn_head_2years = df$expect2yr_syria

df$rtrn_resources %>% table(useNA = "always")
df$prep_resources = df$rtrn_resources
df$rtrn_resources = NULL

df$rtrn_docs %>% table(useNA = "always")
df$prep_docs = df$rtrn_docs
df$rtrn_docs = NULL

df$rtrn_author %>% table(useNA = "always")
df$prep_author = df$rtrn_author
df$rtrn_author = NULL

df$rtrn_unhcr %>% table(useNA = "always")
df$prep_unhcr = df$rtrn_unhcr
df$rtrn_unhcr = NULL

df$rtrn_scope %>% table(useNA = "always")
df$prep_scope = df$rtrn_scope
df$rtrn_scope = NULL

df$rtrn_abort %>% table(useNA = "always")
df$prep_abort = df$rtrn_abort
df$rtrn_abort = NULL

vars = c("prep_resources", "prep_docs", "prep_author", "prep_unhcr", "prep_scope", "prep_abort")
df$prep_resources %>% table(useNA = "always")
df$prep_docs %>% table(useNA = "always")
df$prep_author %>% table(useNA = "always")
df$prep_unhcr %>% table(useNA = "always")
df$prep_scope %>% table(useNA = "always")
df$prep_abort %>% table(useNA = "always")
df[,vars] %>% summary

df$prep_pca = PCA_extract(vars = df[, vars], cor_type = "pear")[[2]] %>% as.vector()
PCA_extract(vars = df[, vars], cor_type = "pear")[[1]]

df$prep_pca %>% summary

# Fixing covariates (control variables) -------------------------------------------------------
df$syr_origin %>% table(useNA = "always")
df$syr_origin %>% class

df$syr_origin_urban %>% table(useNA = "always")

df$head_educ %>% table(useNA = "always")

df$head_educ2 = recode(df$head_educ, "6" = 1, "5" = 1, "4" = 1, "3" = 0, "2" = 0, "1" = 0, .default = NA_real_)
df$head_educ2[is.na(df$head_educ2)] = 0
df$head_educ = NULL

df$sick %>% table(useNA = "always")
df$toddler %>% table(useNA = "always")
df$elderly %>% table(useNA = "always")
df$single_parent %>% table(useNA = "always")
df$female_headed_hh %>% table(useNA = "always")
df$head_female %>% table(useNA = "always")

df$syr_origin %>% table(useNA = "always")
df$syr_origin_urban %>% table(useNA = "always")
df$location_its %>% table(useNA = "always")
df$sick %>% table(useNA = "always")
df$head_educ2 %>% table(useNA = "always")
df$toddler %>% table(useNA = "always")
df$elderly %>% table(useNA = "always")
df$female_headed_hh %>% table(useNA = "always")
df$single_parent
df$single_parent_final %>% table(useNA = "always")
df$head_female %>% table(useNA = "always")
df$location_district %>% table(useNA = "always")
df[,vars] %>% summary

hezb = readRDS("data/hezb_control.rds")
hezb$response_num %>% summary
hezb$hezb_control %>% table

df = df %>% left_join(hezb)
df$hezb_control %>% table

df$hezb_control %>% table(useNA = "always")

covars = c("syr_origin", "syr_origin_urban", "location_its", "sick", "head_educ2", "toddler", "elderly", "female_headed_hh", "location_district", "hezb_control")
df[,covars] %>% summary


# Additive indices --------------------------------------------------------
df3 = df
standardize = function(x){
  x = (x - mean(x)) / sd(x)
}

#Econ Lebanon
df3[, econ_well_being_leb] = df3[,econ_well_being_leb] %>% map_df(standardize)
df3$additive_econ_leb = rowSums(df3[,econ_well_being_leb])
df3$additive_econ_leb = df3$additive_econ_leb %>% standardize


#Social Lebanon
df3[, soc_well_being_leb] = df3[, soc_well_being_leb] %>% map_df(standardize)
df3$additive_soc_leb = rowSums(df3[, soc_well_being_leb])
df3$additive_soc_leb = df3$additive_soc_leb %>% standardize

#Services Lebanon
df3[, services_lebanon] = df3[, services_lebanon] %>% map_df(standardize)
df3$additive_services_leb = rowSums(df3[, services_lebanon])
df3$additive_services_leb = df3$additive_services_leb %>% standardize

#Legal Lebanon
df3[, legal_lebanon] = df3[, legal_lebanon] %>% map_df(standardize)
df3$additive_legal_leb = rowSums(df3[, legal_lebanon])
df3$additive_legal_leb = df3$additive_legal_leb %>% standardize

#Networks Lebanon
df3[, networks_lebanon] = df3[, networks_lebanon] %>% map_df(standardize)
df3$additive_networks_lebanon = rowSums(df3[, networks_lebanon])
df3$additive_networks_lebanon = df3$additive_networks_lebanon %>% standardize

#Safety Syria
df3[, safety_syria] = df3[, safety_syria] %>% map_df(standardize)
df3$additive_safety_syria = rowSums(df3[, safety_syria])
df3$additive_safety_syria = df3$additive_safety_syria %>% standardize


#Econ Syria
df3[, econ_syria] = df3[, econ_syria] %>% map_df(standardize)
df3$additive_econ_syria = rowSums(df3[, econ_syria])
df3$additive_econ_syria = df3$additive_econ_syria %>% standardize

#Services Syria
df3[, services_syria] = df3[, services_syria] %>% map_df(standardize)
df3$additive_services_syria = rowSums(df3[, services_syria])
df3$additive_services_syria = df3$additive_services_syria %>% standardize

#Networks Syria
df3[, networks_syria] = df3[, networks_syria] %>% map_df(standardize)
df3$additive_networks_syria = rowSums(df3[, networks_syria])
df3$additive_networks_syria = df3$additive_networks_syria %>% standardize

#Information Quality
df3[, info_quality] = df3[, info_quality] %>% map_df(standardize)
df3$additive_info_quality = rowSums(df3[, info_quality])
df3$additive_info_quality = df3$additive_info_quality %>% standardize

df3$additive_info_quality %>% summary
df3$additive_mobility = df3$index_mobility
df3$additive_family = df3$index_family

#Exluding regime control bcz it doesnt make sense to have an additive index of this
df3 = df3 %>% select(response_num, names(df3)[str_detect(names(df3), "additive")])

df4 = df %>% left_join(df3)

# switch source
df4 %>% saveRDS("data/baseline_clean_mean.rds")
